package cn.fr4nk.crawler.utils;

import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;

/**
 * Static helpers for stripping HTML markup out of strings.
 *
 * <p>All methods are stateless; the only shared state is a set of precompiled,
 * immutable regular expressions.
 */
public class HtmlStringUtil {

	/**
	 * Built-in patterns used by {@link #rmHtmlTagAndContent(String)}, compiled once
	 * instead of on every call.
	 *
	 * <ul>
	 *   <li>The first matches an attribute-less element, e.g. {@code <div>text</div>}.
	 *       Its content quantifier is {@code .*?} so that an empty element such as
	 *       {@code <div></div>} is matched as well.</li>
	 *   <li>The second matches an element whose opening tag carries attributes,
	 *       e.g. {@code <div class="dp">text</div>}.</li>
	 * </ul>
	 *
	 * <p>{@link Pattern#DOTALL} lets the content span line breaks. The back-reference
	 * ties the closing tag to the same name as the opening tag.
	 */
	private static final Pattern[] RM_HTML_TAG_AND_CONTENT_PATTERNS = {
		Pattern.compile("<(\\w+)>.*?</(\\1)>", Pattern.DOTALL),
		Pattern.compile("<(\\w+)\\s.+?</(\\1)>", Pattern.DOTALL)
	};

	/** File read by {@link #main(String[])} when no path is given on the command line. */
	private static final String DEFAULT_SAMPLE_PATH = "c:/tmp/holder.json";

	/**
	 * Removes the HTML markup from a string but keeps the text inside the elements.
	 *
	 * <p>The input is parsed with Jsoup and the text of the parsed document is returned.
	 *
	 * @param html the HTML to strip; may be {@code null}
	 * @return the text of the parsed document, or {@code null} if {@code html} is {@code null}
	 */
	public static String rmHtmlTags(String html) {
		if (html == null) {
			return null;
		}
		return Jsoup.parse(html).text();
	}

	/**
	 * Manual smoke test: prints the result of {@link #rmHtmlTagAndContent(String)} for
	 * an inline sample and for the content of a file.
	 *
	 * @param args optional; {@code args[0]} is the path of the file to read. When it is
	 *             absent the original default path is used.
	 */
	public static void main(String[] args) {
		String samplePath = (args != null && args.length > 0) ? args[0] : DEFAULT_SAMPLE_PATH;
		String tts = "中国<div>asdasd</div>人\"invType_CN\":\"自<div class=\"dp\">6Ieq54S25Lq66IKh5Lic</div>然人<div class=\"dp\">6Ieq54S25Lq66IKh5Lic</div>股东\"";
		String tt2 = CsvUtil.readString(samplePath);
		System.out.println(rmHtmlTagAndContent(tts));
		System.out.println(rmHtmlTagAndContent(tt2));
	}

	/**
	 * Removes every HTML element, together with the content it encloses, using the
	 * built-in patterns.
	 *
	 * <p>For example {@code a<div>x</div>b} becomes {@code ab}.
	 *
	 * <p>NOTE: matching is regex-based and lazy, so for nested elements of the same
	 * name the match ends at the first closing tag and the remainder is left in place.
	 *
	 * @param html the text to clean; may be {@code null}
	 * @return the text with all matches removed, or {@code null} if {@code html} is {@code null}
	 */
	public static String rmHtmlTagAndContent(String html) {
		if (html == null) {
			return null;
		}
		String ret = html;
		// The patterns are applied one after another, each on the previous result.
		for (Pattern pattern : RM_HTML_TAG_AND_CONTENT_PATTERNS) {
			ret = pattern.matcher(ret).replaceAll("");
		}
		return ret;
	}

	/**
	 * Removes every match of the given regular expressions from the text.
	 *
	 * <p>The expressions are applied in the order given, each on the result of the
	 * previous one. {@code null} entries are skipped.
	 *
	 * @param html    the text to clean; may be {@code null}
	 * @param ptnStrs regular expressions whose matches are removed; may be {@code null} or empty
	 * @return the text with all matches removed; {@code html} itself when there is
	 *         nothing to apply, or {@code null} if {@code html} is {@code null}
	 * @throws java.util.regex.PatternSyntaxException if an expression is not a valid regex
	 */
	public static String rmHtmlTagAndContent(String html, String... ptnStrs) {
		if (html == null || ptnStrs == null) {
			return html;
		}
		String ret = html;
		for (String item : ptnStrs) {
			if (item != null) {
				ret = ret.replaceAll(item, "");
			}
		}
		return ret;
	}

	/**
	 * Returns the first argument that is neither {@code null} nor the empty string.
	 *
	 * @param strList the candidates, checked in order; may be {@code null} or empty
	 * @return the first non-empty candidate, or {@code null} if there is none
	 */
	public static String getNotEmpty(String... strList) {
		if (strList == null) {
			return null;
		}
		for (String s : strList) {
			if (StringUtils.isNotEmpty(s)) {
				return s;
			}
		}
		return null;
	}
}
